This analysis utilizes the data found in a Kaggle competition where competitors seek to predict housing prices in King County, WA, USA (https://bit.ly/2lRv48E). The data set provides fields including the number of bedrooms, number of bathrooms, number of floors, square footage of the living room, square footage of the overall lot, among others. The time frame this data set covers is the years 2014 through 2015. Therefore, this analysis will utilize a number of machine learning models and techniques in order to achieve the best possible models (in this instance measured using MAPE, MAE, MSE, and R-Squared, with emphasis on the MAPE score).

Data Import

# Load the raw train/test CSVs with data.table::fread (fast, returns a data.table).
# stringsAsFactors = FALSE keeps character columns (e.g. `date`) as strings for
# explicit parsing later; spell out FALSE rather than the reassignable alias F.
raw_train_df <- fread('Data/house_price_train.csv', stringsAsFactors = FALSE)
raw_test_df <- fread('Data/house_price_test.csv', stringsAsFactors = FALSE)

str(raw_train_df)
## Classes 'data.table' and 'data.frame':   17277 obs. of  21 variables:
##  $ id           :integer64 9183703376 464000600 2224079050 6163901283 6392003810 7974200948 2426059124 2115510300 ... 
##  $ date         : chr  "5/13/2014" "8/27/2014" "7/18/2014" "1/30/2015" ...
##  $ price        : num  225000 641250 810000 330000 530000 ...
##  $ bedrooms     : int  3 3 4 4 4 4 4 3 4 3 ...
##  $ bathrooms    : num  1.5 2.5 3.5 1.5 1.75 3.5 3.25 2.25 2.5 1.5 ...
##  $ sqft_living  : int  1250 2220 3980 1890 1814 3120 4160 1440 2250 2540 ...
##  $ sqft_lot     : int  7500 2550 209523 7540 5000 5086 47480 10500 6840 9520 ...
##  $ floors       : num  1 3 2 1 1 2 2 1 2 1 ...
##  $ waterfront   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ view         : int  0 2 2 0 0 0 0 0 0 0 ...
##  $ condition    : int  3 3 3 4 4 3 3 3 3 3 ...
##  $ grade        : int  7 10 9 7 7 9 10 8 9 8 ...
##  $ sqft_above   : int  1250 2220 3980 1890 944 2480 4160 1130 2250 1500 ...
##  $ sqft_basement: int  0 0 0 0 870 640 0 310 0 1040 ...
##  $ yr_built     : int  1967 1990 2006 1967 1951 2008 1995 1983 1987 1959 ...
##  $ yr_renovated : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ zipcode      : int  98030 98117 98024 98155 98115 98115 98072 98023 98058 98115 ...
##  $ lat          : num  47.4 47.7 47.6 47.8 47.7 ...
##  $ long         : num  -122 -122 -122 -122 -122 ...
##  $ sqft_living15: int  1260 2200 2220 1890 1290 1880 3400 1510 2480 1870 ...
##  $ sqft_lot15   : int  7563 5610 65775 8515 5000 5092 40428 8125 7386 6800 ...
##  - attr(*, ".internal.selfref")=<externalptr>
summary(raw_train_df)
##        id                 date               price        
##  Min.   :   1000102   Length:17277       Min.   :  78000  
##  1st Qu.:2113701080   Class :character   1st Qu.: 320000  
##  Median :3902100205   Mode  :character   Median : 450000  
##  Mean   :4566440237                      Mean   : 539865  
##  3rd Qu.:7302900090                      3rd Qu.: 645500  
##  Max.   :9900000190                      Max.   :7700000  
##     bedrooms        bathrooms      sqft_living       sqft_lot      
##  Min.   : 1.000   Min.   :0.500   Min.   :  370   Min.   :    520  
##  1st Qu.: 3.000   1st Qu.:1.750   1st Qu.: 1430   1st Qu.:   5050  
##  Median : 3.000   Median :2.250   Median : 1910   Median :   7620  
##  Mean   : 3.369   Mean   :2.114   Mean   : 2080   Mean   :  15186  
##  3rd Qu.: 4.000   3rd Qu.:2.500   3rd Qu.: 2550   3rd Qu.:  10695  
##  Max.   :33.000   Max.   :8.000   Max.   :13540   Max.   :1164794  
##      floors        waterfront            view          condition    
##  Min.   :1.000   Min.   :0.000000   Min.   :0.0000   Min.   :1.000  
##  1st Qu.:1.000   1st Qu.:0.000000   1st Qu.:0.0000   1st Qu.:3.000  
##  Median :1.500   Median :0.000000   Median :0.0000   Median :3.000  
##  Mean   :1.493   Mean   :0.007467   Mean   :0.2335   Mean   :3.413  
##  3rd Qu.:2.000   3rd Qu.:0.000000   3rd Qu.:0.0000   3rd Qu.:4.000  
##  Max.   :3.500   Max.   :1.000000   Max.   :4.0000   Max.   :5.000  
##      grade         sqft_above   sqft_basement       yr_built   
##  Min.   : 3.00   Min.   : 370   Min.   :   0.0   Min.   :1900  
##  1st Qu.: 7.00   1st Qu.:1190   1st Qu.:   0.0   1st Qu.:1951  
##  Median : 7.00   Median :1564   Median :   0.0   Median :1975  
##  Mean   : 7.66   Mean   :1791   Mean   : 289.4   Mean   :1971  
##  3rd Qu.: 8.00   3rd Qu.:2210   3rd Qu.: 556.0   3rd Qu.:1997  
##  Max.   :13.00   Max.   :9410   Max.   :4820.0   Max.   :2015  
##   yr_renovated        zipcode           lat             long       
##  Min.   :   0.00   Min.   :98001   Min.   :47.16   Min.   :-122.5  
##  1st Qu.:   0.00   1st Qu.:98033   1st Qu.:47.47   1st Qu.:-122.3  
##  Median :   0.00   Median :98065   Median :47.57   Median :-122.2  
##  Mean   :  85.35   Mean   :98078   Mean   :47.56   Mean   :-122.2  
##  3rd Qu.:   0.00   3rd Qu.:98117   3rd Qu.:47.68   3rd Qu.:-122.1  
##  Max.   :2015.00   Max.   :98199   Max.   :47.78   Max.   :-121.3  
##  sqft_living15    sqft_lot15    
##  Min.   : 460   Min.   :   659  
##  1st Qu.:1490   1st Qu.:  5100  
##  Median :1840   Median :  7639  
##  Mean   :1986   Mean   : 12826  
##  3rd Qu.:2360   3rd Qu.: 10080  
##  Max.   :6210   Max.   :871200
head(raw_train_df)
##            id      date  price bedrooms bathrooms sqft_living sqft_lot
## 1: 9183703376 5/13/2014 225000        3      1.50        1250     7500
## 2:  464000600 8/27/2014 641250        3      2.50        2220     2550
## 3: 2224079050 7/18/2014 810000        4      3.50        3980   209523
## 4: 6163901283 1/30/2015 330000        4      1.50        1890     7540
## 5: 6392003810 5/23/2014 530000        4      1.75        1814     5000
## 6: 7974200948 5/20/2014 953007        4      3.50        3120     5086
##    floors waterfront view condition grade sqft_above sqft_basement
## 1:      1          0    0         3     7       1250             0
## 2:      3          0    2         3    10       2220             0
## 3:      2          0    2         3     9       3980             0
## 4:      1          0    0         4     7       1890             0
## 5:      1          0    0         4     7        944           870
## 6:      2          0    0         3     9       2480           640
##    yr_built yr_renovated zipcode     lat     long sqft_living15 sqft_lot15
## 1:     1967            0   98030 47.3719 -122.215          1260       7563
## 2:     1990            0   98117 47.6963 -122.393          2200       5610
## 3:     2006            0   98024 47.5574 -121.890          2220      65775
## 4:     1967            0   98155 47.7534 -122.318          1890       8515
## 5:     1951            0   98115 47.6840 -122.281          1290       5000
## 6:     2008            0   98115 47.6762 -122.288          1880       5092
#Check for null values
# Both splits are complete (0 NAs), so no imputation step is required downstream.
sum(is.na(raw_train_df))
## [1] 0
sum(is.na(raw_test_df))
## [1] 0

Initial Cleaning

As time series modeling will not be utilized in this analysis, the day, month, and year of each purchase will be individually parsed out rather than using the date time field.

# Work on copies so the raw imports stay untouched.
clean_train_df <- raw_train_df
clean_test_df <- raw_test_df

# Train data set: parse the "m/d/Y" strings into Date, then derive calendar parts.
clean_train_df$date <- as.Date(raw_train_df$date, "%m/%d/%Y")

# The original wrote clean_train_df[, clean_train_df$date], which evaluates to
# the same date vector but obscures intent; reference the column directly.
clean_train_df$year <- year(clean_train_df$date)
clean_train_df$month <- month(clean_train_df$date)
clean_train_df$day <- day(clean_train_df$date)
# `date` is already a Date here, so no format string or re-parse is needed;
# $wday is the day of week as 0 (Sunday) through 6 (Saturday).
clean_train_df$day_of_week <- as.POSIXlt(clean_train_df$date)$wday

# Test data set: identical treatment.
clean_test_df$date <- as.Date(clean_test_df$date, "%m/%d/%Y")

clean_test_df$year <- year(clean_test_df$date)
clean_test_df$month <- month(clean_test_df$date)
clean_test_df$day <- day(clean_test_df$date)
clean_test_df$day_of_week <- as.POSIXlt(clean_test_df$date)$wday

Exploratory Analysis

Histograms

In order to produce readable visualizations, a random selection of 1000 houses will be taken for all subsequent graphs/charts. Based on the histogram outputs, it appears that at least some of the variables are not normally distributed.

# Fix the RNG so the 1,000-house sample is reproducible across renders.
set.seed(12345)

# Column groups for the faceted histogram helper: square-footage measures vs
# count/ordinal measures.
sqft_hist <- c('sqft_living', 'sqft_lot', 'sqft_above', 'sqft_living15', 'sqft_lot15')
stats_hist <- c('bedrooms', 'floors','condition', 'grade')

#Randomly Sample 1000 values
df.1000 <- clean_train_df[sample(nrow(clean_train_df), 1000),]

# multiple_hist / single_hist are project helper functions defined elsewhere.
multiple_hist(df.1000, sqft_hist)

multiple_hist(df.1000, stats_hist)

single_hist(df.1000$yr_built, "Year Built")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

single_hist(df.1000$yr_renovated, "Year Renovated")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

single_hist(df.1000$price, "Price")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

single_hist(df.1000$sqft_basement, "Basement Area")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Scatterplots

The following scatter plots seek to visually describe the relationship between the numerical variables and the target price variable. From this sample, it can be seen that the area of the living room and the number of bathrooms in particular have a strong positive relationship with price (an intuitive observation from a business perspective, as these factors are often explicitly taken into account with regard to pricing).

# Restrict to the numeric predictors plus the target; `price` is kept last so
# it can be dropped below when building the predictor list.
numerical_var <- c('bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'sqft_above', 'sqft_basement', 'yr_built', 'sqft_living15', 'sqft_lot15', 'price')
scatter_df <- clean_train_df[, ..numerical_var]
# All column names except the trailing `price` target.
var_list <- head(names(scatter_df), -1)

# One price-vs-predictor scatter plot per variable, laid out in a single grid.
plot_list <- lapply(var_list, function(v) gg_scatter(v, df = scatter_df))
do.call(grid.arrange, plot_list)

Geographic Analysis

The following map allows for the visualization of where houses of different price bands (based on quartiles) are located.

#Bin into quartiles for data visualization
df.1000$bin <- factor(Hmisc::cut2(df.1000$price, g = 4), labels = c(1:4))

colorsmap <- colors()[c(490,24,100,657)]
# Leaflet popups are rendered as HTML, so line breaks must be <br> tags.
# (The original passed sep = "\n" to paste0(); paste0() has no `sep` argument,
# so the "\n" strings were simply concatenated into the text, and a newline
# renders as a plain space inside an HTML popup.)
map <- leaflet(data.frame(df.1000)) %>%
  addTiles() %>%  # Add default OpenStreetMap map tiles
  addCircleMarkers(lng=~long, lat=~lat,
                   popup= paste0("Number of Bedrooms: ", df.1000$bedrooms, "<br>",
                                 "Number of Bathrooms: ", df.1000$bathrooms, "<br>",
                                 "Living Room Size: " , df.1000$sqft_living, "<br>",
                                 "Lot Size: ", df.1000$sqft_lot, "<br>",
                                 "Number of Floors: ", df.1000$floors, "<br>",
                                 "Current Condition: ", df.1000$condition),
                   color= ~colorsmap,
                   group= unique(df.1000$bin)) #%>% 
# This seems to be no longer supported
  # addLegend(position = 'bottomright', colors = colorsmap, labels = unique(df.1000$bin))

# addLegend(map, position = 'bottomright', colors = colorsmap, labels = unique(df.1000$bin))

map

Outlier Analysis

Univariate Analysis

The following charts look at the distributions of each numerical value to visually see outliers. In this sample there are clear outliers for all variables other than the number of floors and the year the home was built.

# Boxplot each numeric predictor (price excluded) and report its outlier count;
# univariate_outlier is a project helper defined elsewhere.
# NOTE(review): this loops over clean_test_df, while the clipping step below is
# applied to clean_train_df — confirm whether the *train* set was intended here.
for (var in numerical_var[1:(length(numerical_var)-1)]){
  univariate_outlier(clean_test_df, var)
}

## [1] "Outliers:  5"

## [1] "Outliers:  9"

## [1] "Outliers:  93"

## [1] "Outliers:  433"

## [1] "Outliers:  0"

## [1] "Outliers:  76"

## [1] "Outliers:  50"

## [1] "Outliers:  0"

## [1] "Outliers:  67"

## [1] "Outliers:  386"

Bivariate Analysis

The following charts look at the distributions of each numerical value to visually see outliers on a monthly and daily basis to help understand any temporal patterns. Once again, outliers occur in all variables other than the number of floors and the year built. It is interesting to note that, relatively speaking, a greater proportion of outliers occur on weekdays.

# Boxplot each numeric predictor by month and by day-of-week to surface any
# temporal outlier patterns; bivariate_outlier is a project helper.
# NOTE(review): as above, this inspects clean_test_df while the clipping below
# targets clean_train_df — confirm intended.
for (var in numerical_var[1:(length(numerical_var)-1)]){
  bivariate_outlier(clean_test_df, var)
}

Clip Outliers

As the above charts demonstrate that outliers are present, they will be clipped if they lie outside a 95% distribution band.

# Winsorize each numeric column (including price) to its [2.5%, 97.5%] quantile
# band using the project `clip` helper. Assigning by reference with data.table's
# := over .SDcols keeps column names and order aligned automatically; the
# original round-tripped through unlist()/matrix()/as.data.table(), which
# depended on fragile positional alignment and silently dropped column names.
clean_train_df[, (numerical_var) := lapply(.SD, clip,
                                           lower_bound = .025,
                                           upper_bound = .975),
               .SDcols = numerical_var]

head(clean_train_df)
##            id       date  price bedrooms bathrooms sqft_living sqft_lot
## 1: 9183703376 2014-05-13 225000        3      1.50        1250   7500.0
## 2:  464000600 2014-08-27 641250        3      2.50        2220   2550.0
## 3: 2224079050 2014-07-18 810000        4      3.50        3980  87509.8
## 4: 6163901283 2015-01-30 330000        4      1.50        1890   7540.0
## 5: 6392003810 2014-05-23 530000        4      1.75        1814   5000.0
## 6: 7974200948 2014-05-20 953007        4      3.50        3120   5086.0
##    floors waterfront view condition grade sqft_above sqft_basement
## 1:      1          0    0         3     7     1250.0             0
## 2:      3          0    2         3    10     2220.0             0
## 3:      2          0    2         3     9     3840.5             0
## 4:      1          0    0         4     7     1890.0             0
## 5:      1          0    0         4     7      944.0           870
## 6:      2          0    0         3     9     2480.0           640
##    yr_built yr_renovated zipcode     lat     long sqft_living15 sqft_lot15
## 1:     1967            0   98030 47.3719 -122.215          1260       7563
## 2:     1990            0   98117 47.6963 -122.393          2200       5610
## 3:     2006            0   98024 47.5574 -121.890          2220      60548
## 4:     1967            0   98155 47.7534 -122.318          1890       8515
## 5:     1951            0   98115 47.6840 -122.281          1290       5000
## 6:     2008            0   98115 47.6762 -122.288          1880       5092
##    year month day day_of_week
## 1: 2014     5  13           2
## 2: 2014     8  27           3
## 3: 2014     7  18           5
## 4: 2015     1  30           5
## 5: 2014     5  23           5
## 6: 2014     5  20           2

Correlation Heatmap

# Pairwise correlations of every feature except the identifier and raw date,
# rendered as an interactive clustered heatmap.
heatmap_data<-clean_train_df[, !c('id','date')]
d3heatmap::d3heatmap(cor(heatmap_data))

Baseline Model Comparisons

This analysis will initially compare the results of Lasso Linear Regression, Ranger’s implementation of Random Forest, and XG Boost to determine which algorithm will be used going forward.

Lasso Linear Regression

# Hold out 20% of the cleaned training data for model comparison (f_partition is
# a project helper); drop the raw date column since its year/month/day/
# day_of_week components are already separate features.
split_clean_train_df <- f_partition(clean_train_df, test_proportion = 0.2, seed = 123456)
split_clean_train_df$train$date = NULL
split_clean_train_df$test$date = NULL

# 5-fold cross-validation to choose the lasso penalty (alpha = 1 -> pure L1).
glmnet_cv<-cv.glmnet(x = data.matrix(split_clean_train_df$train[, !c('id','price')]), nfolds = 5, 
                     y = split_clean_train_df$train[['price']],
                     alpha=1, family = 'gaussian', standardize = T)
plot.cv.glmnet(glmnet_cv)

glmnet_cv$lambda.min
## [1] 300.9572
# Refit a single lasso model at the CV-optimal lambda.
glmnet_0<-glmnet(x = data.matrix(split_clean_train_df$train[, !c('id','price')]), 
                 y = split_clean_train_df$train[['price']],
                 family = 'gaussian',
                 alpha=1, lambda = glmnet_cv$lambda.min)

print(glmnet_0)
## 
## Call:  glmnet(x = data.matrix(split_clean_train_df$train[, !c("id",      "price")]), y = split_clean_train_df$train[["price"]], family = "gaussian",      alpha = 1, lambda = glmnet_cv$lambda.min) 
## 
##      Df   %Dev Lambda
## [1,] 21 0.7467    301
# Coefficients at lambda.min; `.` entries (e.g. month) were shrunk to zero.
glmnet_0$beta
## 22 x 1 sparse Matrix of class "dgCMatrix"
##                          s0
## bedrooms      -1.809343e+04
## bathrooms      2.520548e+04
## sqft_living    1.157481e+02
## sqft_lot       4.479120e-01
## floors         2.318851e+04
## waterfront     2.048945e+05
## view           4.510006e+04
## condition      2.692461e+04
## grade          9.492991e+04
## sqft_above     1.299709e+00
## sqft_basement -9.948718e+00
## yr_built      -2.217149e+03
## yr_renovated   1.958915e+01
## zipcode       -4.090499e+02
## lat            5.681984e+05
## long          -1.367021e+05
## sqft_living15  4.209482e+01
## sqft_lot15    -9.499088e-01
## year           2.948837e+04
## month          .           
## day           -1.298241e+02
## day_of_week    1.656678e+03
# Score the holdout split and start the prediction-comparison table.
test_glmnet<-predict(glmnet_0, newx = data.matrix(split_clean_train_df$test[,!c('id','price')]))

df_pred<-split_clean_train_df$test[, .(id=1:.N, price, test_glmnet)]
str(df_pred)
## Classes 'data.table' and 'data.frame':   3456 obs. of  3 variables:
##  $ id         : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ price      : num  810000 953007 495000 1080000 705000 ...
##  $ test_glmnet: num  870918 749509 241732 864831 833831 ...
##  - attr(*, ".internal.selfref")=<externalptr>
# Holdout metrics for the lasso baseline (rmse/mae/mape/custom_rsq are project helpers).
rmse_glmnet<-rmse(real=split_clean_train_df$test$price, predicted = test_glmnet)
mae_glmnet<-mae(real=split_clean_train_df$test$price, predicted = test_glmnet)
mape_glmnet<-mape(real=split_clean_train_df$test$price, predicted = test_glmnet)
mape_glmnet
## [1] 0.2211987
# NOTE(review): "rsq_glment" is a typo of rsq_glmnet; harmless since it is only
# referenced in these two lines, but worth renaming for consistency.
rsq_glment<-custom_rsq(real=split_clean_train_df$test$price, predicted = test_glmnet)
rsq_glment
## [1] 0.7629821

Ranger Random Forest

# Baseline random forest via ranger with impurity-based variable importance;
# the id column is excluded so it cannot be used as a split variable.
baseline_rf <- ranger(formula = as.formula(price~.), data=split_clean_train_df$train[,!c('id')], importance = 'impurity')
print(baseline_rf)
## Ranger result
## 
## Call:
##  ranger(formula = as.formula(price ~ .), data = split_clean_train_df$train[,      !c("id")], importance = "impurity") 
## 
## Type:                             Regression 
## Number of trees:                  500 
## Sample size:                      13821 
## Number of independent variables:  22 
## Mtry:                             4 
## Target node size:                 5 
## Variable importance mode:         impurity 
## Splitrule:                        variance 
## OOB prediction error (MSE):       9872336587 
## R squared (OOB):                  0.8799315
# Score the holdout split; ranger returns predictions inside $predictions.
test_rf<-predict(baseline_rf, data = split_clean_train_df$test, type='response')$predictions

# Append the random-forest predictions to the comparison table.
df_pred<-cbind(df_pred, test_rf)
str(df_pred)
## Classes 'data.table' and 'data.frame':   3456 obs. of  4 variables:
##  $ id         : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ price      : num  810000 953007 495000 1080000 705000 ...
##  $ test_glmnet: num  870918 749509 241732 864831 833831 ...
##  $ test_rf    : num  975215 1064661 415067 861272 663111 ...
##  - attr(*, ".internal.selfref")=<externalptr>
# Holdout metrics for the random forest baseline.
rmse_rf<-rmse(real=split_clean_train_df$test$price, predicted = test_rf)
mae_rf<-mae(real=split_clean_train_df$test$price, predicted = test_rf)
mape_rf<-mape(real=split_clean_train_df$test$price, predicted = test_rf)
mape_rf
## [1] 0.1255336
rsq_rf<-custom_rsq(real=split_clean_train_df$test$price, predicted = test_rf)
rsq_rf
## [1] 0.893551

XG Boost

# Baseline XGBoost using a *linear* booster (gblinear) for 100 rounds, fit on
# the same train split with id and the target excluded from the design matrix.
# NOTE(review): objective 'reg:linear' is deprecated in newer xgboost releases
# in favour of 'reg:squarederror' (the same squared-error loss).
xgb_reg_0<-xgboost(booster='gblinear',
                   data=data.matrix(split_clean_train_df$train[, !c('id','price'), with=F]),
                   label=split_clean_train_df$train$price,
                   nrounds = 100,
                   objective='reg:linear')
## [1]  train-rmse:230610.468750 
## [2]  train-rmse:220846.671875 
## [3]  train-rmse:215033.453125 
## [4]  train-rmse:211002.625000 
## [5]  train-rmse:208047.359375 
## [6]  train-rmse:205814.953125 
## [7]  train-rmse:204087.859375 
## [8]  train-rmse:202721.671875 
## [9]  train-rmse:201616.375000 
## [10] train-rmse:200701.296875 
## [11] train-rmse:199925.406250 
## [12] train-rmse:199251.656250 
## [13] train-rmse:198653.796875 
## [14] train-rmse:198112.062500 
## [15] train-rmse:197612.375000 
## [16] train-rmse:197144.781250 
## [17] train-rmse:196701.953125 
## [18] train-rmse:196278.828125 
## [19] train-rmse:195871.421875 
## [20] train-rmse:195477.031250 
## [21] train-rmse:195093.859375 
## [22] train-rmse:194720.781250 
## [23] train-rmse:194356.812500 
## [24] train-rmse:194000.734375 
## [25] train-rmse:193653.250000 
## [26] train-rmse:193312.703125 
## [27] train-rmse:192980.125000 
## [28] train-rmse:192654.453125 
## [29] train-rmse:192335.718750 
## [30] train-rmse:192024.359375 
## [31] train-rmse:191719.171875 
## [32] train-rmse:191420.765625 
## [33] train-rmse:191129.203125 
## [34] train-rmse:190844.078125 
## [35] train-rmse:190565.375000 
## [36] train-rmse:190292.812500 
## [37] train-rmse:190026.687500 
## [38] train-rmse:189766.015625 
## [39] train-rmse:189511.875000 
## [40] train-rmse:189262.625000 
## [41] train-rmse:189019.421875 
## [42] train-rmse:188782.031250 
## [43] train-rmse:188549.859375 
## [44] train-rmse:188322.984375 
## [45] train-rmse:188100.796875 
## [46] train-rmse:187884.015625 
## [47] train-rmse:187672.125000 
## [48] train-rmse:187465.093750 
## [49] train-rmse:187262.312500 
## [50] train-rmse:187064.140625 
## [51] train-rmse:186870.234375 
## [52] train-rmse:186680.687500 
## [53] train-rmse:186495.312500 
## [54] train-rmse:186314.046875 
## [55] train-rmse:186136.015625 
## [56] train-rmse:185962.453125 
## [57] train-rmse:185792.796875 
## [58] train-rmse:185626.546875 
## [59] train-rmse:185463.421875 
## [60] train-rmse:185303.781250 
## [61] train-rmse:185147.796875 
## [62] train-rmse:184995.078125 
## [63] train-rmse:184845.296875 
## [64] train-rmse:184698.453125 
## [65] train-rmse:184554.796875 
## [66] train-rmse:184413.734375 
## [67] train-rmse:184275.859375 
## [68] train-rmse:184140.515625 
## [69] train-rmse:184007.875000 
## [70] train-rmse:183877.703125 
## [71] train-rmse:183750.187500 
## [72] train-rmse:183624.937500 
## [73] train-rmse:183502.000000 
## [74] train-rmse:183381.531250 
## [75] train-rmse:183263.171875 
## [76] train-rmse:183147.406250 
## [77] train-rmse:183033.312500 
## [78] train-rmse:182921.343750 
## [79] train-rmse:182811.531250 
## [80] train-rmse:182703.828125 
## [81] train-rmse:182598.234375 
## [82] train-rmse:182493.468750 
## [83] train-rmse:182391.328125 
## [84] train-rmse:182290.593750 
## [85] train-rmse:182191.750000 
## [86] train-rmse:182094.546875 
## [87] train-rmse:181999.046875 
## [88] train-rmse:181904.984375 
## [89] train-rmse:181812.250000 
## [90] train-rmse:181721.406250 
## [91] train-rmse:181632.250000 
## [92] train-rmse:181544.140625 
## [93] train-rmse:181457.312500 
## [94] train-rmse:181371.671875 
## [95] train-rmse:181287.812500 
## [96] train-rmse:181205.171875 
## [97] train-rmse:181123.500000 
## [98] train-rmse:181043.359375 
## [99] train-rmse:180964.203125 
## [100]    train-rmse:180886.375000
print(xgb_reg_0)
## ##### xgb.Booster
## raw: 488 bytes 
## call:
##   xgb.train(params = params, data = dtrain, nrounds = nrounds, 
##     watchlist = watchlist, verbose = verbose, print_every_n = print_every_n, 
##     early_stopping_rounds = early_stopping_rounds, maximize = maximize, 
##     save_period = save_period, save_name = save_name, xgb_model = xgb_model, 
##     callbacks = callbacks, booster = "gblinear", objective = "reg:linear")
## params (as set within xgb.train):
##   booster = "gblinear", objective = "reg:linear", silent = "1"
## xgb.attributes:
##   niter
## callbacks:
##   cb.print.evaluation(period = print_every_n)
##   cb.evaluation.log()
## # of features: 22 
## niter: 100
## nfeatures : 22 
## evaluation_log:
##     iter train_rmse
##        1   230610.5
##        2   220846.7
## ---                
##       99   180964.2
##      100   180886.4
# Score the holdout split with the boosted linear model.
test_xgb<-predict(xgb_reg_0, newdata = data.matrix(split_clean_train_df$test[, !c('id','price'), with=F]), 
                  type='response')

# Append the XGBoost predictions to the comparison table.
df_pred<-cbind(df_pred, test_xgb)
str(df_pred)
## Classes 'data.table' and 'data.frame':   3456 obs. of  5 variables:
##  $ id         : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ price      : num  810000 953007 495000 1080000 705000 ...
##  $ test_glmnet: num  870918 749509 241732 864831 833831 ...
##  $ test_rf    : num  975215 1064661 415067 861272 663111 ...
##  $ test_xgb   : num  878559 716272 366077 1020515 695939 ...
##  - attr(*, ".internal.selfref")=<externalptr>
# Holdout metrics for XGBoost — the weakest of the three baselines here.
rmse_xgb<-rmse(real=split_clean_train_df$test$price, predicted = test_xgb)
mae_xgb<-mae(real=split_clean_train_df$test$price, predicted = test_xgb)
mape_xgb<-mape(real=split_clean_train_df$test$price, predicted = test_xgb)
mape_xgb
## [1] 0.3042548
rsq_xgb<-custom_rsq(real=split_clean_train_df$test$price, predicted = test_xgb)
rsq_xgb
## [1] 0.6131065

Model Comparison

As can be seen from the following charts outlining each algorithm’s prediction metrics, Random Forest proved to have superior results when compared to the other two and will be used for subsequent feature engineering and tuning.

# Side-by-side RMSE/MAE/MAPE/R-squared for the three baselines (project helper);
# the random forest is the clear winner.
metrics_plot(df_pred, c('glmnet','rf','xgb_reg'), verbose = T)

##    method     rmse      mae      mape      rsq
## 1:     rf 94173.38 62314.86 0.1255336 0.893551

Feature Engineering

A number of features will be created in the hopes their inclusion into the model will improve the overall prediction abilities. The features created include 1) Weekday/Weekend flag, 2) Holiday flag, 3) Renovation flag (defined as when the 2015 area of either the lot or living room is different from the original area), 4) Missing Renovation Year flag (as the presence of a renovation year should correspond to a positive renovation flag), and 5) House Age. It was found that the inclusion of the first, third, and fourth features actually improved model performance as seen from the below graphs.

1. Weekday/Weekend

# Carry the baseline random-forest holdout predictions forward as the
# comparison column for the feature-engineering pipeline.
df_pipeline_pred<-split_clean_train_df$test[, .(id=1:.N, price, test_rf)]
colnames(df_pipeline_pred) <-c('id','price','baseline')

fe_train_df1 <- clean_train_df
fe_test_df1 <- clean_test_df

# Feature 1: weekend-sale flag. chron::is.weekend() already returns a logical,
# so the original as.logical() wrapper was a no-op and is dropped.
fe_train_df1$weekend <- is.weekend(clean_train_df$date)
fe_test_df1$weekend <- is.weekend(clean_test_df$date)
fe_train_df1$date = NULL

fe_output_1 <- split_and_train(fe_train_df1, df_pipeline_pred)
## Ranger result
## 
## Call:
##  ranger(formula = as.formula(price ~ .), data = split_fe_train_df$train[,      !c("id")], importance = "impurity") 
## 
## Type:                             Regression 
## Number of trees:                  500 
## Sample size:                      13821 
## Number of independent variables:  23 
## Mtry:                             4 
## Target node size:                 5 
## Variable importance mode:         impurity 
## Splitrule:                        variance 
## OOB prediction error (MSE):       9932652150 
## R squared (OOB):                  0.8791979
# Compare the weekend-flag model (fe1) against the baseline random forest.
metrics_plot(fe_output_1[[3]], c('baseline','fe1'), verbose = T)

##      method     rmse      mae      mape      rsq
## 1: baseline 94173.38 62314.86 0.1255336 0.893551

2. Holiday

fe_train_df2 <- clean_train_df
fe_test_df2 <- clean_test_df

# Feature 2: holiday-sale flag. chron::is.holiday() already returns a logical,
# so the original as.logical() wrapper was a no-op and is dropped.
fe_train_df2$holiday <- is.holiday(clean_train_df$date)
fe_test_df2$holiday <- is.holiday(clean_test_df$date)
fe_train_df2$date = NULL

fe_output_2 <- split_and_train(fe_train_df2, fe_output_1[[3]])
## Ranger result
## 
## Call:
##  ranger(formula = as.formula(price ~ .), data = split_fe_train_df$train[,      !c("id")], importance = "impurity") 
## 
## Type:                             Regression 
## Number of trees:                  500 
## Sample size:                      13821 
## Number of independent variables:  23 
## Mtry:                             4 
## Target node size:                 5 
## Variable importance mode:         impurity 
## Splitrule:                        variance 
## OOB prediction error (MSE):       9922857033 
## R squared (OOB):                  0.879317
# Compare the holiday-flag model (fe2) against prior iterations.
metrics_plot(fe_output_2[[3]], c('baseline','fe1','fe2'), verbose = T)

##      method     rmse      mae      mape      rsq
## 1: baseline 94173.38 62314.86 0.1255336 0.893551
# Drop the raw date from the test set now that all date-derived features exist.
# NOTE(review): this mutates clean_test_df mid-pipeline, so fe_test copies made
# after this point lack `date` while earlier ones kept it — confirm intended.
clean_test_df$date = NULL

3. Renovation Flag

fe_train_df3 <- clean_train_df
fe_test_df3 <- clean_test_df

# Feature 3: renovation proxy — flag houses whose 2015 living/lot area differs
# from the originally recorded measurement.
fe_train_df3$renovated <- ifelse(((fe_train_df3$sqft_living != fe_train_df3$sqft_living15) | 
                                     (fe_train_df3$sqft_lot != fe_train_df3$sqft_lot15)), 1, 0)
# Fix: the test-set column was previously misspelled "rennovated", which made
# the later fe_test_df4$renovated lookup return NULL so the test-set
# missing-renovation-year flag was never built correctly.
fe_test_df3$renovated <- ifelse(((fe_test_df3$sqft_living != fe_test_df3$sqft_living15) | 
                                     (fe_test_df3$sqft_lot != fe_test_df3$sqft_lot15)), 1, 0)

fe_output_3 <- split_and_train(fe_train_df3, fe_output_2[[3]])
## Ranger result
## 
## Call:
##  ranger(formula = as.formula(price ~ .), data = split_fe_train_df$train[,      !c("id")], importance = "impurity") 
## 
## Type:                             Regression 
## Number of trees:                  500 
## Sample size:                      13821 
## Number of independent variables:  24 
## Mtry:                             4 
## Target node size:                 5 
## Variable importance mode:         impurity 
## Splitrule:                        variance 
## OOB prediction error (MSE):       10101553722 
## R squared (OOB):                  0.8771437
# Compare the renovation-flag model (fe3) against prior iterations.
metrics_plot(fe_output_3[[3]], c('baseline','fe1','fe2','fe3'), verbose = T)

##      method     rmse      mae      mape      rsq
## 1: baseline 94173.38 62314.86 0.1255336 0.893551

4. Missing Renovation Year

fe_train_df4 <- fe_train_df3
fe_test_df4 <- fe_test_df3

# Feature 4: flag renovated houses whose recorded renovation year is missing (0).
# NOTE(review): fe_test_df3 created its flag under the misspelled column name
# "rennovated", so fe_test_df4$renovated is NULL here and the test-set flag is
# never populated — align the column names upstream.
fe_train_df4$missing_ren_year <- ifelse(((fe_train_df4$yr_renovated == 0) & (fe_train_df4$renovated == T)), 1, 0)
fe_test_df4$missing_ren_year <- ifelse(((fe_test_df4$yr_renovated == 0) & (fe_test_df4$renovated == T)), 1, 0)

fe_output_4 <- split_and_train(fe_train_df4, fe_output_3[[3]])
## Ranger result
## 
## Call:
##  ranger(formula = as.formula(price ~ .), data = split_fe_train_df$train[,      !c("id")], importance = "impurity") 
## 
## Type:                             Regression 
## Number of trees:                  500 
## Sample size:                      13821 
## Number of independent variables:  25 
## Mtry:                             5 
## Target node size:                 5 
## Variable importance mode:         impurity 
## Splitrule:                        variance 
## OOB prediction error (MSE):       9763103935 
## R squared (OOB):                  0.88126
# Compare the missing-renovation-year model (fe4) against prior iterations.
metrics_plot(fe_output_4[[3]], c('baseline','fe1','fe2','fe3','fe4'), verbose = T)

##    method     rmse      mae      mape       rsq
## 1:    fe4 93277.24 61544.41 0.1235152 0.8955673

5. House Age

fe_train_df5 <- clean_train_df
fe_test_df5 <- clean_test_df

# Feature 5: house age in years.
# NOTE(review): age is measured from the date the script is *run*
# (year(Sys.Date())), not from the 2014–15 sale year, so the feature's values
# change on every re-run — `year - yr_built` would be deterministic; confirm.
fe_train_df5$house_age <- year(Sys.Date()) - fe_train_df5$yr_built
fe_test_df5$house_age <- year(Sys.Date()) - fe_test_df5$yr_built

fe_output_5 <- split_and_train(fe_train_df5, fe_output_4[[3]])
## Ranger result
## 
## Call:
##  ranger(formula = as.formula(price ~ .), data = split_fe_train_df$train[,      !c("id")], importance = "impurity") 
## 
## Type:                             Regression 
## Number of trees:                  500 
## Sample size:                      13821 
## Number of independent variables:  24 
## Mtry:                             4 
## Target node size:                 5 
## Variable importance mode:         impurity 
## Splitrule:                        variance 
## OOB prediction error (MSE):       10076946986 
## R squared (OOB):                  0.877443
# Compare the house-age model (fe5) against prior iterations.
metrics_plot(fe_output_5[[3]], c('baseline','fe1','fe2','fe3','fe4','fe5'), verbose = T)

##    method     rmse      mae      mape       rsq
## 1:    fe4 93277.24 61544.41 0.1235152 0.8955673

Combine Best Features

A combination of the first, third, and fourth engineered features are combined to train on the next model iteration. This combined model is found to be superior to any of the other models separately and therefore these features will be included in the tuning phase.

# Combine the winning engineered features on top of the weekend-flag frames
# (fe1): renovation flag (fe3) and missing-renovation-year flag (fe4).
fe_train_df_final <- fe_train_df1
fe_test_df_final<- fe_test_df1

# Renovation proxy: 2015 living/lot area differs from the original measurement.
fe_train_df_final$renovated <- ifelse(((fe_train_df_final$sqft_living != fe_train_df_final$sqft_living15) | 
                                     (fe_train_df_final$sqft_lot != fe_train_df_final$sqft_lot15)), 1, 0)
fe_test_df_final$renovated <- ifelse(((fe_test_df_final$sqft_living != fe_test_df_final$sqft_living15) | 
                                     (fe_test_df_final$sqft_lot != fe_test_df_final$sqft_lot15)), 1, 0)

# Renovated houses whose recorded renovation year is missing (0).
fe_train_df_final$missing_ren_year <- ifelse(((fe_train_df_final$yr_renovated == 0) 
                                              & (fe_train_df_final$renovated == T)), 1, 0)
fe_test_df_final$missing_ren_year <- ifelse(((fe_test_df_final$yr_renovated == 0) 
                                             & (fe_test_df_final$renovated == T)), 1, 0)

fe_output_final <- split_and_train(fe_train_df_final, fe_output_5[[3]])
## Ranger result
## 
## Call:
##  ranger(formula = as.formula(price ~ .), data = split_fe_train_df$train[,      !c("id")], importance = "impurity") 
## 
## Type:                             Regression 
## Number of trees:                  500 
## Sample size:                      13821 
## Number of independent variables:  25 
## Mtry:                             5 
## Target node size:                 5 
## Variable importance mode:         impurity 
## Splitrule:                        variance 
## OOB prediction error (MSE):       9750571539 
## R squared (OOB):                  0.8814124
# Final comparison: the combined fe1 + fe3 + fe4 model vs all prior iterations.
metrics_plot(fe_output_final[[3]], c('baseline','fe1','fe2','fe3','fe4','fe5','final_fe'), verbose = T)

##    method     rmse      mae      mape       rsq
## 1:    fe4 93277.24 61544.41 0.1235152 0.8955673

Hyperparameter Tuning

Per the documentation, the TuneRanger package seeks to find the optimal minimum node size, sample fraction, and mtry for a given Ranger model. Rather than using the standard cross-validation folds, out-of-bag (OOB) predictions are used for faster performance time. This final tuned model further improves the performance from previous iterations with a MAPE on the holdout of approximately **0.1152482**.

# test_rf_tuned <- csrf(
#   formula = as.formula(price~.),
#   training_data = split_clean_train_df$train[,!c('id')],
#   test_data = split_clean_train_df$test[,!c('id')],
#   params1 = list(importance = 'impurity'),
#   params2 = list(num.trees = 50)
# )

##################################################### TESTING ###################################################### 
####################################################################################################################
####################################################################################################################
# Carry the feature-engineered train/holdout split and the Kaggle validation
# set forward into the final tuning/testing stage.
final_train_df <- fe_output_final[[1]]
final_test_df <- fe_test_df_final

# mlr tasks don't support logical/categorical columns, so encode the weekend
# flag numerically. as.integer() maps TRUE -> 1L, FALSE -> 0L, NA -> NA just
# like ifelse(x, 1, 0) did, but actually returns integers (the original
# ifelse produced doubles despite the "convert to integers" intent).
#Train/Test Split
final_train_df$train$weekend <- as.integer(final_train_df$train$weekend)
final_train_df$test$weekend <- as.integer(final_train_df$test$weekend)

#Validation Split
final_test_df$weekend <- as.integer(final_test_df$weekend)

# Wrap the training split (minus the id column) in an mlr regression task for
# TuneRanger; the target is the raw sale price.
task = makeRegrTask(data = final_train_df$train[,!c('id')], target = "price")
## Warning in makeTask(type = type, data = data, weights = weights, blocking
## = blocking, : Provided data is not a pure data.frame but from class
## data.table, hence it will be converted.
# Estimate runtime
estimateTimeTuneRanger(task)
## Approximated time for tuning: 3H 15M 23S
# Tuning
# TuneRanger searches mtry / min.node.size / sample.fraction via model-based
# optimization, scoring candidates on OOB error (y below is OOB MSE) instead
# of cross-validation folds; 30 init designs + 50 infill iterations follow.
res = tuneRanger(task, num.trees = 500, num.threads = 2, iters = 50, save.file.path = NULL)
## Computing y column(s) for design. Not provided.
## [mbo] 0: mtry=9; min.node.size=151; sample.fraction=0.432 : y = 1.39e+10 : 6.0 secs : initdesign
## [mbo] 0: mtry=17; min.node.size=113; sample.fraction=0.838 : y = 1.12e+10 : 22.0 secs : initdesign
## [mbo] 0: mtry=3; min.node.size=25; sample.fraction=0.351 : y = 1.33e+10 : 6.7 secs : initdesign
## [mbo] 0: mtry=21; min.node.size=1.16e+03; sample.fraction=0.615 : y = 2.46e+10 : 7.8 secs : initdesign
## [mbo] 0: mtry=10; min.node.size=8; sample.fraction=0.563 : y = 9.17e+09 : 28.5 secs : initdesign
## [mbo] 0: mtry=9; min.node.size=84; sample.fraction=0.644 : y = 1.14e+10 : 14.5 secs : initdesign
## [mbo] 0: mtry=11; min.node.size=1.62e+03; sample.fraction=0.436 : y = 3.24e+10 : 2.2 secs : initdesign
## [mbo] 0: mtry=15; min.node.size=11; sample.fraction=0.278 : y = 9.77e+09 : 14.1 secs : initdesign
## [mbo] 0: mtry=18; min.node.size=6; sample.fraction=0.735 : y = 8.96e+09 : 37.1 secs : initdesign
## [mbo] 0: mtry=2; min.node.size=330; sample.fraction=0.78 : y = 2.15e+10 : 2.0 secs : initdesign
## [mbo] 0: mtry=19; min.node.size=558; sample.fraction=0.461 : y = 1.96e+10 : 6.1 secs : initdesign
## [mbo] 0: mtry=11; min.node.size=5; sample.fraction=0.79 : y = 8.85e+09 : 32.7 secs : initdesign
## [mbo] 0: mtry=3; min.node.size=14; sample.fraction=0.506 : y = 1.18e+10 : 6.2 secs : initdesign
## [mbo] 0: mtry=14; min.node.size=412; sample.fraction=0.738 : y = 1.55e+10 : 6.3 secs : initdesign
## [mbo] 0: mtry=14; min.node.size=2; sample.fraction=0.585 : y = 8.86e+09 : 25.8 secs : initdesign
## [mbo] 0: mtry=6; min.node.size=157; sample.fraction=0.38 : y = 1.58e+10 : 4.8 secs : initdesign
## [mbo] 0: mtry=23; min.node.size=62; sample.fraction=0.541 : y = 1.09e+10 : 21.5 secs : initdesign
## [mbo] 0: mtry=16; min.node.size=198; sample.fraction=0.314 : y = 1.55e+10 : 6.9 secs : initdesign
## [mbo] 0: mtry=20; min.node.size=3; sample.fraction=0.405 : y = 9.13e+09 : 32.0 secs : initdesign
## [mbo] 0: mtry=5; min.node.size=4; sample.fraction=0.229 : y = 1.1e+10 : 5.6 secs : initdesign
## [mbo] 0: mtry=18; min.node.size=41; sample.fraction=0.811 : y = 9.76e+09 : 20.5 secs : initdesign
## [mbo] 0: mtry=4; min.node.size=2; sample.fraction=0.669 : y = 9.98e+09 : 10.8 secs : initdesign
## [mbo] 0: mtry=23; min.node.size=1.89e+03; sample.fraction=0.338 : y = 3.65e+10 : 1.7 secs : initdesign
## [mbo] 0: mtry=7; min.node.size=569; sample.fraction=0.621 : y = 1.93e+10 : 2.6 secs : initdesign
## [mbo] 0: mtry=25; min.node.size=32; sample.fraction=0.206 : y = 1.11e+10 : 8.7 secs : initdesign
## [mbo] 0: mtry=8; min.node.size=2.33e+03; sample.fraction=0.267 : y = 3.76e+10 : 0.9 secs : initdesign
## [mbo] 0: mtry=24; min.node.size=3; sample.fraction=0.857 : y = 9.94e+09 : 78.7 secs : initdesign
## [mbo] 0: mtry=1; min.node.size=20; sample.fraction=0.887 : y = 2.57e+10 : 3.6 secs : initdesign
## [mbo] 0: mtry=22; min.node.size=924; sample.fraction=0.698 : y = 2.14e+10 : 8.6 secs : initdesign
## [mbo] 0: mtry=13; min.node.size=17; sample.fraction=0.486 : y = 9.41e+09 : 25.7 secs : initdesign
## [mbo] 1: mtry=10; min.node.size=3; sample.fraction=0.505 : y = 9.09e+09 : 21.1 secs : infill_cb
## [mbo] 2: mtry=15; min.node.size=2; sample.fraction=0.799 : y = 8.82e+09 : 45.2 secs : infill_cb
## [mbo] 3: mtry=24; min.node.size=2; sample.fraction=0.202 : y = 9.83e+09 : 17.5 secs : infill_cb
## [mbo] 4: mtry=13; min.node.size=3; sample.fraction=0.692 : y = 8.86e+09 : 35.6 secs : infill_cb
## [mbo] 5: mtry=9; min.node.size=2; sample.fraction=0.426 : y = 9.34e+09 : 14.2 secs : infill_cb
## [mbo] 6: mtry=20; min.node.size=4; sample.fraction=0.784 : y = 9.07e+09 : 41.0 secs : infill_cb
## [mbo] 7: mtry=11; min.node.size=2; sample.fraction=0.737 : y = 8.87e+09 : 26.0 secs : infill_cb
## [mbo] 8: mtry=17; min.node.size=6; sample.fraction=0.452 : y = 9.1e+09 : 19.0 secs : infill_cb
## [mbo] 9: mtry=16; min.node.size=2; sample.fraction=0.785 : y = 8.87e+09 : 39.4 secs : infill_cb
## [mbo] 10: mtry=25; min.node.size=2; sample.fraction=0.44 : y = 9.27e+09 : 38.5 secs : infill_cb
## [mbo] 11: mtry=11; min.node.size=5; sample.fraction=0.616 : y = 8.95e+09 : 25.4 secs : infill_cb
## [mbo] 12: mtry=7; min.node.size=2; sample.fraction=0.564 : y = 9.25e+09 : 16.7 secs : infill_cb
## [mbo] 13: mtry=19; min.node.size=2; sample.fraction=0.41 : y = 9.1e+09 : 30.7 secs : infill_cb
## [mbo] 14: mtry=14; min.node.size=5; sample.fraction=0.78 : y = 8.84e+09 : 35.1 secs : infill_cb
## [mbo] 15: mtry=19; min.node.size=13; sample.fraction=0.436 : y = 9.34e+09 : 18.8 secs : infill_cb
## [mbo] 16: mtry=12; min.node.size=2; sample.fraction=0.777 : y = 8.86e+09 : 30.0 secs : infill_cb
## [mbo] 17: mtry=18; min.node.size=2; sample.fraction=0.774 : y = 8.92e+09 : 41.0 secs : infill_cb
## [mbo] 18: mtry=18; min.node.size=35; sample.fraction=0.586 : y = 9.76e+09 : 16.0 secs : infill_cb
## [mbo] 19: mtry=25; min.node.size=14; sample.fraction=0.751 : y = 9.86e+09 : 35.8 secs : infill_cb
## [mbo] 20: mtry=25; min.node.size=5; sample.fraction=0.327 : y = 9.51e+09 : 19.1 secs : infill_cb
## [mbo] 21: mtry=11; min.node.size=2; sample.fraction=0.662 : y = 8.83e+09 : 23.3 secs : infill_cb
## [mbo] 22: mtry=4; min.node.size=2; sample.fraction=0.452 : y = 1.04e+10 : 8.7 secs : infill_cb
## [mbo] 23: mtry=9; min.node.size=4; sample.fraction=0.691 : y = 8.97e+09 : 17.1 secs : infill_cb
## [mbo] 24: mtry=17; min.node.size=6; sample.fraction=0.792 : y = 8.94e+09 : 30.0 secs : infill_cb
## [mbo] 25: mtry=13; min.node.size=2; sample.fraction=0.776 : y = 8.83e+09 : 29.6 secs : infill_cb
## [mbo] 26: mtry=15; min.node.size=2; sample.fraction=0.396 : y = 9.14e+09 : 32.9 secs : infill_cb
## [mbo] 27: mtry=19; min.node.size=74; sample.fraction=0.742 : y = 1.07e+10 : 36.8 secs : infill_cb
## [mbo] 28: mtry=9; min.node.size=7; sample.fraction=0.455 : y = 9.41e+09 : 10.7 secs : infill_cb
## [mbo] 29: mtry=23; min.node.size=3; sample.fraction=0.505 : y = 9.13e+09 : 40.6 secs : infill_cb
## [mbo] 30: mtry=9; min.node.size=2; sample.fraction=0.636 : y = 9e+09 : 31.1 secs : infill_cb
## [mbo] 31: mtry=18; min.node.size=5; sample.fraction=0.799 : y = 8.99e+09 : 59.8 secs : infill_cb
## [mbo] 32: mtry=14; min.node.size=3; sample.fraction=0.747 : y = 8.84e+09 : 45.9 secs : infill_cb
## [mbo] 33: mtry=24; min.node.size=2; sample.fraction=0.675 : y = 9.34e+09 : 90.9 secs : infill_cb
## [mbo] 34: mtry=16; min.node.size=2; sample.fraction=0.557 : y = 8.89e+09 : 50.3 secs : infill_cb
## [mbo] 35: mtry=21; min.node.size=2; sample.fraction=0.52 : y = 9.04e+09 : 47.6 secs : infill_cb
## [mbo] 36: mtry=11; min.node.size=3; sample.fraction=0.771 : y = 8.82e+09 : 41.7 secs : infill_cb
## [mbo] 37: mtry=14; min.node.size=2; sample.fraction=0.69 : y = 8.86e+09 : 46.9 secs : infill_cb
## [mbo] 38: mtry=11; min.node.size=4; sample.fraction=0.723 : y = 8.86e+09 : 44.7 secs : infill_cb
## [mbo] 39: mtry=15; min.node.size=2; sample.fraction=0.5 : y = 8.96e+09 : 41.8 secs : infill_cb
## [mbo] 40: mtry=11; min.node.size=3; sample.fraction=0.669 : y = 8.86e+09 : 39.4 secs : infill_cb
## [mbo] 41: mtry=18; min.node.size=7; sample.fraction=0.527 : y = 9.03e+09 : 29.9 secs : infill_cb
## [mbo] 42: mtry=14; min.node.size=3; sample.fraction=0.788 : y = 8.86e+09 : 40.6 secs : infill_cb
## [mbo] 43: mtry=13; min.node.size=5; sample.fraction=0.745 : y = 8.81e+09 : 40.3 secs : infill_cb
## [mbo] 44: mtry=16; min.node.size=6; sample.fraction=0.682 : y = 8.86e+09 : 46.6 secs : infill_cb
## [mbo] 45: mtry=13; min.node.size=2; sample.fraction=0.223 : y = 9.82e+09 : 15.1 secs : infill_cb
## [mbo] 46: mtry=15; min.node.size=2; sample.fraction=0.766 : y = 8.84e+09 : 55.5 secs : infill_cb
## [mbo] 47: mtry=13; min.node.size=4; sample.fraction=0.761 : y = 8.81e+09 : 49.2 secs : infill_cb
## [mbo] 48: mtry=22; min.node.size=2; sample.fraction=0.428 : y = 9.14e+09 : 41.2 secs : infill_cb
## [mbo] 49: mtry=16; min.node.size=2; sample.fraction=0.66 : y = 8.85e+09 : 53.6 secs : infill_cb
## [mbo] 50: mtry=11; min.node.size=5; sample.fraction=0.757 : y = 8.88e+09 : 39.6 secs : infill_cb
# Model with the new tuned hyperparameters
res$model
## Model for learner.id=regr.ranger; learner.class=regr.ranger
## Trained on: task.id = final_train_df$train[, !c("id")]; obs = 13821; features = 25
## Hyperparameters: num.threads=2,verbose=FALSE,respect.unordered.factors=order,mtry=13,min.node.size=4,sample.fraction=0.769,num.trees=500,replace=FALSE
# Prediction
# Score the tuned model on the holdout split; mlr's predict() returns a
# WrappedModel prediction whose $data$response column holds the predicted
# prices.
final <- predict(res$model, newdata = final_train_df$test[,!c('id')])$data$response
## Warning in predict.WrappedModel(res$model, newdata =
## final_train_df$test[, : Provided data for prediction is not a pure
## data.frame but from class data.table, hence it will be converted.
# Append the tuned model's holdout predictions to the running comparison table.
df_pipeline_final<-cbind(fe_output_final[[3]], final)

# The tuned model now leads all prior iterations (MAPE 0.1152 on the holdout).
metrics_plot(df_pipeline_final, c('baseline','fe1','fe2','fe3','fe4','fe5','final_fe','final_model'), verbose = T)

##         method     rmse      mae      mape       rsq
## 1: final_model 88726.88 57831.07 0.1152482 0.9055079

Retrain Model On Entire Data Set and Predict on Test Set

The following sections retrain the model, using the feature engineering and optimal hyperparameters from the previous sections, on the entire training set. This retrained model is then used to make predictions on the validation set, which are finally prepared in the required CSV format.

# Recombine the train and holdout splits so the final model is fit on all
# 17,277 training rows.
final_total_train<- rbind(final_train_df$train, final_train_df$test)

#Running this was computationally expensive and ultimately unsuccessful
# NOTE(review): as.factor(price) would also have made caret treat this as a
# classification problem rather than regression.
# fit_control <- trainControl(method = "cv", number = 3, verboseIter = TRUE, search = "random")
# final_rf <- train(as.factor(price) ~ ., data = final_total_train[,!c('id')],
#                  method = "ranger",
#                  trControl = fit_control)

# Refit ranger directly on the full training data with TuneRanger's
# recommended hyperparameters (mtry, min.node.size, sample.fraction).
# NOTE(review): as.formula() is redundant here — price ~ . is already a
# formula — but harmless; kept so the code matches the printed Call below.
final_rf <- ranger(formula = as.formula(price~.), data=final_total_train[,!c('id')], 
                   importance = 'impurity',
                   mtry = res$recommended.pars$mtry, 
                   min.node.size = res$recommended.pars$min.node.size,
                   sample.fraction = res$recommended.pars$sample.fraction)
## Growing trees.. Progress: 73%. Estimated remaining time: 11 seconds.
print(final_rf)
## Ranger result
## 
## Call:
##  ranger(formula = as.formula(price ~ .), data = final_total_train[,      !c("id")], importance = "impurity", mtry = res$recommended.pars$mtry,      min.node.size = res$recommended.pars$min.node.size, sample.fraction = res$recommended.pars$sample.fraction) 
## 
## Type:                             Regression 
## Number of trees:                  500 
## Sample size:                      17277 
## Number of independent variables:  25 
## Mtry:                             13 
## Target node size:                 4 
## Variable importance mode:         impurity 
## Splitrule:                        variance 
## OOB prediction error (MSE):       8543289410 
## R squared (OOB):                  0.8963732
# Predict sale prices for the Kaggle test set with the retrained model;
# ranger's predict() stores the fitted values in $predictions.
final_test_rf<-predict(final_rf, data = final_test_df, type='response')$predictions
# Pair each prediction with its listing id.
# NOTE(review): ids come from clean_test_df while predictions were computed on
# final_test_df — this assumes both tables share identical row order; confirm
# the upstream feature-engineering steps never reorder or drop rows.
prediction<-clean_test_df[, .(id=id,final_test_rf)]
head(prediction)
##            id final_test_rf
## 1: 6414100192      481312.9
## 2: 6054650070      399438.2
## 3:   16000397      208393.7
## 4: 2524049179     1205436.0
## 5: 8562750320      602436.8
## 6: 7589200193      521044.3

Variable Importance

This chart displays the variable importance sorted by node impurity (i.e. the variation generated when observations reach that variable). Many of the most important factors influencing house pricing are intuitive (i.e. the size/area has a clear positive relationship with price).

# Turn ranger's named importance vector into a two-column table: setDT()
# promotes the data.frame to a data.table in place, keep.rownames moves the
# variable names into column "rn", and the trailing [] forces it to print.
importance_df <- data.frame(final_rf$variable.importance)
setDT(importance_df, keep.rownames = TRUE)[]
##                   rn final_rf.variable.importance
##  1:         bedrooms                 3.022104e+12
##  2:        bathrooms                 1.531373e+13
##  3:      sqft_living                 2.327707e+14
##  4:         sqft_lot                 1.403515e+13
##  5:           floors                 2.182092e+12
##  6:       waterfront                 7.469033e+12
##  7:             view                 1.943842e+13
##  8:        condition                 4.159362e+12
##  9:            grade                 2.998744e+14
## 10:       sqft_above                 4.252713e+13
## 11:    sqft_basement                 8.176851e+12
## 12:         yr_built                 3.602017e+13
## 13:     yr_renovated                 1.850260e+12
## 14:          zipcode                 2.312867e+13
## 15:              lat                 2.295265e+14
## 16:             long                 4.790245e+13
## 17:    sqft_living15                 6.870708e+13
## 18:       sqft_lot15                 1.471390e+13
## 19:             year                 1.253822e+12
## 20:            month                 5.634836e+12
## 21:              day                 7.456911e+12
## 22:      day_of_week                 3.607965e+12
## 23:          weekend                 2.289366e+11
## 24:        renovated                 1.787497e+11
## 25: missing_ren_year                 6.487339e+11
##                   rn final_rf.variable.importance
# Rename the columns for plotting clarity.
colnames(importance_df) <- c('variable', 'importance')

# Horizontal bar chart of impurity-based importance; reorder() sorts the bars
# so the most important variables sit at the top after coord_flip().
ggplot(importance_df, aes(x = reorder(variable, importance), y = importance, fill = importance)) +
    geom_bar(stat = "identity", position = "dodge") +
    coord_flip() +
    ylab("Variable Importance") +
    xlab("") +
    ggtitle("Information Value Summary") +
    # was guides(fill=F): never use T/F for TRUE/FALSE, and logical guide
    # suppression is deprecated in ggplot2 >= 3.3.4 — "none" is the
    # supported way to hide the legend.
    guides(fill = "none") +
    scale_fill_gradient(low = "red", high = "blue")

CSV Output

# Rename to the submission schema expected by Kaggle: exactly two columns,
# id and target.
colnames(prediction) <- c('id', 'target')
# row.names = FALSE is essential: by default write.csv prepends an unnamed
# row-index column, which would break the two-column submission format.
write.csv(prediction, file = "output.csv", row.names = FALSE)